library(tidyverse)
library(stringr)
library(randomForest)
library(vip)
library(rpart)
library(rpart.plot)
library(caret)
library(tidytext)
The dataset contains 2515 unique videos and their subtitles from over 91 different YouTubers, ranging from all different kinds of categories.
# Read the raw YouTube dataset and preview its first rows.
df_raw <- read.csv(file = "data.csv")
head(x = df_raw)
Do a sentiment analysis on the subtitles and find the best multiple linear regression model to predict the number of views using Subscribers, CC, Released, Category, Sentiment and Length.
df_raw %>% select(Views, Subscribers, Released, Length) %>% head(10)
Looking at the data, we notice several problems in the data, like:
Views: The Views variable is in string format and the units are different, like “10K views”, “10M views”. We prefer it to be in number and in the same unit in order to conduct statistical analysis.
Subscribers: The same problems as Views. The Subscribers variable is like “10K subscribers”, “10M subscribers”.
Length: The video length is in string format, like “12:00”, “1:12:00”. We need it to be in number and in the same unit.
Released: The Released variable is in string format, like “2 years ago”, “10 months ago”. We need it to be a number and in a single unit.
Therefore, we need to do data cleaning first.
# Unify units and convert string to number, like: "10K views" -> 10, "10M views" -> 10000.
# Output is in thousands (K).
# Fix: the original removed the FIRST occurrence of the last character, which
# corrupted unsuffixed counts (e.g. "525 views" -> "25"); strip the suffix
# positionally instead, and divide raw counts by 1000 to keep the K unit.
cleanViews <- function(str) {
  str <- sub(" views", "", str, fixed = TRUE)
  last <- substr(str, nchar(str), nchar(str))
  if (last == "M") {
    1000 * as.numeric(substr(str, 1, nchar(str) - 1))
  } else if (last == "K") {
    as.numeric(substr(str, 1, nchar(str) - 1))
  } else {
    # No unit suffix: a raw count like "525"; convert to thousands for
    # consistency with the K/M branches.
    as.numeric(str) / 1000
  }
}
# Unify units and convert string to number, like: "10K subscribers" -> 10,
# "10M subscribers" -> 10000. Output is in thousands (K).
# Fix: same defect as cleanViews — the original removed the first occurrence
# of the last character, corrupting unsuffixed counts; strip positionally.
cleanSubscribers <- function(str) {
  str <- sub(" subscribers", "", str, fixed = TRUE)
  last <- substr(str, nchar(str), nchar(str))
  if (last == "M") {
    1000 * as.numeric(substr(str, 1, nchar(str) - 1))
  } else if (last == "K") {
    as.numeric(substr(str, 1, nchar(str) - 1))
  } else {
    # No unit suffix: a raw count; convert to thousands for consistency.
    as.numeric(str) / 1000
  }
}
# Convert a video duration string to minutes, rounding the trailing seconds
# up: "12:00" -> 13, "1:12:00" -> 73.
# Fix: the original h:mm:ss branch computed h but then used m twice
# ((m + 1) + 60*m), doubling minutes instead of adding hours; use 60*h + m.
# NOTE(review): the "+1" ceiling applies even when seconds are "00" — it is
# kept here because the mm:ss branch always used it; confirm intent.
cleanLength <- function(str) {
  parts <- as.numeric(strsplit(str, ":", fixed = TRUE)[[1]])
  if (length(parts) == 3) {
    # h:mm:ss
    60 * parts[1] + parts[2] + 1
  } else {
    # mm:ss
    parts[1] + 1
  }
}
# Convert an upload-age string to months ago: "1 year ago" -> 12,
# "10 months ago" -> 10. A leading "Streamed " prefix is dropped.
# Fix: the original compared the unit to the exact string "years", so the
# singular "1 year ago" fell through to the months branch and returned 1
# instead of 12; match the "year" prefix to cover both forms.
cleanReleased <- function(str) {
  str <- sub("Streamed ", "", str, fixed = TRUE)
  parts <- strsplit(str, " ", fixed = TRUE)[[1]]
  n <- as.numeric(parts[1])
  unit <- parts[2]
  if (startsWith(unit, "year")) {
    n * 12
  } else {
    # "month"/"months" (and anything else) treated as months, as before.
    # NOTE(review): if the data also contains "weeks"/"days", this returns
    # the raw count — confirm against the raw Released values.
    n
  }
}
# Remove NAs
# Drop rows whose Released field is missing or blank — those rows cannot be
# converted to a number of months by cleanReleased below.
df <- df_raw %>%
filter(
!is.na(Released) & Released != ""
)
# Clean the data
# Apply the scalar clean* helpers element-wise (map_dbl iterates over each
# string). After this step Views/Subscribers are numeric in thousands (K),
# Length is in minutes and Released is in months.
df <- df %>% mutate(
Views = map_dbl(Views, cleanViews),
Subscribers = map_dbl(Subscribers, cleanSubscribers),
Length = map_dbl(Length, cleanLength),
Released = map_dbl(Released, cleanReleased)
)
# Preview the cleaned columns.
df %>% select(Views, Subscribers, Released, Length) %>% head(10)
# Save for future use
write_csv(df, "cleaned_data.csv")
After cleaning, Views, Subscribers, Released and Length are numeric: Views and Subscribers are in thousands (K), Released is in months, and Length is in minutes.
# Reload the cleaned data so the modeling section can be re-run without
# repeating the cleaning step above.
df <- read_csv("cleaned_data.csv")
## Rows: 2098 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Id, Channel, Title, URL, Category, Transcript
## dbl (5): Subscribers, CC, Released, Views, Length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# CC is a 0/1 flag (presumably closed captions — confirm); treat it and
# Category as categorical predictors for the models below.
df$CC <- as.factor(df$CC)
df$Category <- as.factor(df$Category)
# NOTE(review): Subscribers is already read as dbl per the column spec above,
# so this coercion is a no-op kept for safety.
df$Subscribers <- as.numeric(df$Subscribers)
head(df)
# Class balance of the CC flag — roughly even.
table(df$CC)
##
## 0 1
## 1094 1004
# log(Views) by CC level; the log scale tames the heavy right tail of Views.
boxplot(log(Views) ~ CC, data = df)
# Distribution summaries of the numeric predictors.
summary(df$Length)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.00 11.00 26.94 17.00 3539.00
summary(df$Released)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 24.00 36.00 46.46 60.00 156.00
summary(df$Subscribers)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 179 1730 4010 6871 8240 89700
# Pairwise scatterplots of the response against each candidate predictor.
pairs(Views ~ CC + Released + Length + Subscribers + Category, data=df)
TODO by Xinyi
# Keep only what the text analysis needs: one row per video (Title) with its
# subtitle text (Transcript).
df_script <- df %>%
select(Title, Transcript)
head(df_script)
# tidytext's bundled stop-word lexicons (columns: word, lexicon).
data("stop_words")
# Append "_" as an extra stop word (new row: word = "_", lexicon = "custom");
# presumably the transcripts contain "_" tokens that should not be counted.
custom_stop_words <- rbind(stop_words, c("_", "custom"))
#bigram
# Exploratory look at two-word tokens per video; the result is only printed,
# not stored or used downstream.
df_script %>%
group_by(Title) %>%
unnest_tokens(word, Transcript, token = "ngrams", n = 2)
# Tokenize transcripts into single words, drop stop words, and count word
# frequencies per video. count(word) counts within the existing Title
# grouping, so `total` appears to be the number of retained words in each
# video; `n` is that word's frequency in the video.
# NOTE(review): this OVERWRITES df_script — the Transcript column is gone
# after this step.
df_script<- df_script %>%
group_by(Title) %>%
unnest_tokens(word, Transcript) %>%
anti_join(custom_stop_words) %>%
count(word, sort = TRUE) %>%
mutate(total = sum(n)) %>%
ungroup()
## Joining, by = "word"
#inner_join(get_sentiments("afinn")) #%>%
#summarise(sentiment = sum(value))
#mutate(total = sum(word)) %>%
#mutate(perc = round(n/total, 2))
head(df_script)
# Per-video AFINN sentiment: attach each word's AFINN value, sum the values
# within a Title into afinn_score, and compute each word's share of the
# video's retained words.
# NOTE(review): sum(value) ignores word frequency n — sum(value * n) may be
# what was intended; confirm.
# NOTE(review): the result is only printed, never joined back into df, so the
# regression models below cannot use the Sentiment feature yet (see TODO).
df_script %>%
group_by(Title) %>%
inner_join(get_sentiments("afinn")) %>%
mutate(afinn_score = sum(value)) %>%
mutate(perc = round(n/total, 2))
## Joining, by = "word"
Randomly split the data set in a 70% training and 30% test set. Make sure to use set.seed() so that your results are reproducible
# Reproducible 70/30 train-test split of the cleaned data.
set.seed(652)
n <- nrow(df)
train_index <- sample(seq_len(n), size = round(0.7 * n))
df_train <- df[train_index, ]
df_test <- df[-train_index, ]
# Baseline multiple linear regression for Views.
# Fix: the original fit used data = df (the FULL data set), leaking the test
# rows into training even though a 70/30 split was just created; fit on
# df_train so its error is comparable with the tree and forest below.
# NOTE(review): Category and the AFINN sentiment score are not included yet;
# add them once the sentiment feature is merged into df.
lm1 <- lm(Views ~ CC + Released + Length + Subscribers, data = df_train)
summary(lm1)
# Test-set RMSE for the model-comparison table at the end.
pred_lm <- predict(lm1, newdata = df_test)
RMSE(df_test$Views, pred_lm)
##
## Call:
## lm(formula = Views ~ CC + Released + Length + Subscribers, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27428 -4097 -1619 1845 299580
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 996.87365 540.88021 1.843 0.06546 .
## CC1 1634.75349 533.44242 3.065 0.00221 **
## Released 20.86454 9.12050 2.288 0.02226 *
## Length -2.23572 1.44601 -1.546 0.12222
## Subscribers 1.37690 0.02726 50.517 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11950 on 2093 degrees of freedom
## Multiple R-squared: 0.561, Adjusted R-squared: 0.5602
## F-statistic: 668.7 on 4 and 2093 DF, p-value: < 2.2e-16
# Grow a regression tree for Views on the training split.
t1 <- rpart(
  Views ~ CC + Released + Category + Length + Subscribers,
  data = df_train,
  method = "anova"
)
# Draw the fitted decision tree.
rpart.plot(t1)
# R-squared and cross-validated relative error as splits are added.
rsq.rpart(t1)
##
## Regression tree:
## rpart(formula = Views ~ CC + Released + Category + Length + Subscribers,
## data = df_train, method = "anova")
##
## Variables actually used in tree construction:
## [1] Category Length Subscribers
##
## Root node error: 5.1207e+11/1469 = 348582113
##
## n= 1469
##
## CP nsplit rel error xerror xstd
## 1 0.310493 0 1.00000 1.00150 0.24648
## 2 0.110528 1 0.68951 0.73970 0.19123
## 3 0.067016 2 0.57898 0.65220 0.19748
## 4 0.044738 3 0.51196 0.60975 0.19366
## 5 0.031356 4 0.46722 0.59609 0.19321
## 6 0.017075 5 0.43587 0.55910 0.19210
## 7 0.016164 6 0.41879 0.56347 0.19921
## 8 0.010000 7 0.40263 0.54809 0.19903
# Make prediction
# Predict Views for the held-out 30% with the fitted tree.
pred_tree <- predict(t1, newdata = df_test)
# Compute the RMSE
# caret's RMSE (root mean squared error) on the test set.
RMSE(df_test$Views, pred_tree)
## [1] 7977.73
# Bagged ensemble: a 500-tree random forest on the same predictors,
# seeded for reproducibility.
set.seed(652)
rf1 <- randomForest(
  Views ~ CC + Released + Category + Length + Subscribers,
  data = df_train,
  importance = TRUE
)
rf1
##
## Call:
## randomForest(formula = Views ~ CC + Released + Category + Length + Subscribers, data = df_train, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 175526230
## % Var explained: 49.65
Use the vip() function to make a variable importance plot. Which variables are most important?
vip(rf1, num_features = 14, include_type = TRUE)
# Make prediction
# Predict Views for the held-out 30% with the random forest.
pred_rf <- predict(rf1, newdata = df_test)
# Compute the RMSE
# Test-set RMSE — slightly better than the single tree's 7977.73.
RMSE(df_test$Views, pred_rf)
## [1] 7645.176
| Model | RMSE | R Squared | Number of Coefficients | performance | interpretability |
|---|---|---|---|---|---|
| linear regression | - | 0.561 | 5 | - | high |
| regression tree | 7977.73 | - | - | - | high |
| random forest | 7645.18 | 0.4965 (OOB % var explained) | - | best RMSE | low |
Aggregated/ensemble models are not universally better than their “single” counterparts; they are better if and only if the single models suffer from instability. With 1469 training rows and only 5 predictor columns, we are in a comfortable sample-size situation in which even a single decision tree can be reasonably stable.